package ClusterPackage;

/**
 * Created by LZG on 2016/1/14.
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.text.DecimalFormat;
import java.util.*;

public class ModelClass {

    List<String> filesList;// names of the files uploaded by the user
    private int filesSize = 1000;// number of uploaded files (overwritten in the constructor)
    private int fileCount;// text counter

    // Node set: maps each node (word) to its occurrence count, used when computing edge weights.
    HashMap<String, Integer> nodesMap;

    // Edge set: used to count edges and to record each edge's final weight.
    HashMap<String, HashMap<String, Double>> edgesMap;

    private double value = 0.001;// TF-IDF feature-value threshold
    private static final DecimalFormat doubleFormat = new DecimalFormat("0.000");// rounds word frequencies to three decimals
    private ArrayList<MSTEdge> MST = new ArrayList<MSTEdge>();// stores the minimum spanning tree

    // No session — test-only constructor.
    // Initializes the static paths, scans the upload folder ONCE for the file
    // list (the original scanned it twice), and creates the similarity folder.
    ModelClass(){
        // initialize() makes PathClass build its static folder paths
        PathClass.initialize();

        // single directory scan: derive both the list and its size from it
        filesList = PathClass.findAllFiles(PathClass.initialFolder);
        filesSize = filesList.size();

        // create the similarity output folder
        PathClass.createFolder_sim();
    }

    // Build the text mapping model, then compute pairwise text similarities.
    public void buildModel() {
        // build the text mapping model
        buildTextModel();
        System.out.println("文本模型建立");

        // compute pairwise similarity
        getSimilarity();
        System.out.println("相似度计算完成");

        // Disabled follow-up pipeline (MST construction + clustering):
        /*
        MST = convertTreeSetToArrayList(createMST(PathClass.similarityFolder));// build the MST
        MST = readMST();// read the MST back from disk
        System.out.println("创建最小生成树MST");

        // cluster by requested cluster count
        ArrayList<MSTEdge> ts1 = clusteringByNumber(MST, 10);

        // cluster by similarity threshold
        //ArrayList<MSTEdge> ts2 = clusteringByValue(MST, 0.1);
        System.out.println("聚类完成");

        // write clustering results
        //showCluster(MSTFolder+"test.txt",MST);
        showCluster(PathClass.MSTFolder+"clusterResultByNumber.txt",ts1);
        //showCluster(MSTFolder+"clusterResultByValue.txt",ts2);
        */
    }

    // Maximum common graph (MCG) of two texts: intersect their node and edge
    // sets, persist both intersections, and return the resulting similarity.
    public double getMCG(String fileName1, String fileName2) {
        // common nodes of the two texts
        HashMap<String, Integer> commonNodes = getCommonNodes(fileName1, fileName2);

        // persist the common nodes
        outPutHashMap(PathClass.mcgNodesFolder + fileName1 + "And" + fileName2,
                commonNodes);

        // common edges restricted to the common nodes
        HashMap<String, HashMap<String, Double>> commonEdges =
                getCommonEdges(fileName1, fileName2, commonNodes);

        // persist the common edges
        outPutEdges(PathClass.mcgEdgesFolder + fileName1 + "And" + fileName2,
                commonEdges);

        // similarity derived from the common subgraph
        return calculateSimilarity3(fileName1, fileName2, commonNodes, commonEdges);
    }

    // Read a node set from a document. Line format: word<TAB>count.
    // Returns a possibly empty map; a missing or unreadable file yields
    // whatever was parsed before the failure (best effort, as before).
    public HashMap<String, Integer> getNodes(String path) {
        HashMap<String, Integer> nodesMap = new HashMap<String, Integer>();
        // try-with-resources: the reader is closed even when parsing fails
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(path)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int mark = line.indexOf('\t');
                if (mark < 0) {
                    continue; // skip malformed lines instead of aborting the whole read
                }
                nodesMap.put(line.substring(0, mark),
                        Integer.valueOf(line.substring(mark + 1)));
            }
        } catch (Exception e) {
            // deliberate best effort: callers treat an unreadable file as "no nodes"
        }
        return nodesMap;
    }

    // Record the node set to a document: reads per-word TF-IDF values from
    // finalValueFolder/<fileName> (word<TAB>...<TAB>tfidf), writes
    // "word<TAB>count" to nodesFolder/<fileName> for words whose TF-IDF
    // exceeds the threshold `value`, and removes below-threshold words from
    // the given nodesMap.
    public void recordNodes(String fileName, HashMap<String, Integer> nodesMap) {
        // try-with-resources closes both streams even on a parse failure
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(
                     PathClass.finalValueFolder + fileName)));
             BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                     PathClass.nodesFolder + fileName), false))) {

            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int first = line.indexOf('\t');
                String word = line.substring(0, first);

                // the TF-IDF value is the last tab-separated field on the line
                double TF_IDF = Double.valueOf(line.substring(line.lastIndexOf('\t') + 1));

                if (TF_IDF > value) {
                    // feature passes the threshold: persist "word<TAB>count"
                    writer.write(word + '\t' + nodesMap.get(word));
                    writer.newLine();
                } else {
                    // below threshold: drop the word from the in-memory node set
                    // (remove() is a no-op when the key is absent, so no
                    // containsKey pre-check is needed)
                    nodesMap.remove(word);
                }
            }
            // single flush at the end instead of one per line
            writer.flush();
        } catch (Exception e) {
            // deliberate best effort: a malformed file leaves nodesMap partially filtered
        }
    }

    // Build the edge set from finalTextFolder/<fileName>: consecutive words on
    // a line form a directed edge preWord -> currentWord; the full stop "。"
    // breaks adjacency so no edge crosses a sentence boundary.
    public HashMap<String, HashMap<String, Double>> BuildEdges(String fileName) {
        HashMap<String, HashMap<String, Double>> edgesMap = new HashMap<String, HashMap<String, Double>>();
        String preWord = "NOTHING";// previous feature word (sentinel at sentence start)
        String currentWord;        // current feature word

        // status 0: sentence start — the word creates a node only, no edge
        // status 1: mid-sentence — the word creates both a node and an edge
        int status;
        // try-with-resources: the reader is closed even when a line fails
        try (BufferedReader finalTextReader = new BufferedReader(new FileReader(
                new File(PathClass.finalTextFolder + fileName)))) {

            for (String line = finalTextReader.readLine(); line != null;
                 line = finalTextReader.readLine()) {
                status = 0;

                // words are separated by single spaces
                StringTokenizer st = new StringTokenizer(line, " ");
                while (st.hasMoreTokens()) {
                    currentWord = st.nextToken();

                    if (!currentWord.equals("。")) {
                        // a word: record an edge unless we are at sentence start
                        if (status == 1) {
                            recordEdges(preWord, currentWord, edgesMap);
                        } else {
                            // after reading the first word we are mid-sentence
                            status = 1;
                        }
                        preWord = currentWord;
                    } else {
                        // sentence-ending punctuation blocks word adjacency
                        status = 0;
                        preWord = "NOTHING";
                    }
                }// end while
            }// end for
        } catch (Exception e) {
            // deliberate best effort: an unreadable file yields a (possibly empty) edge map
        }
        return edgesMap;
    }

    // Read an edge set back from a document. Line format:
    //   preNode<TAB>suc1$weight1<TAB>suc2$weight2...
    public HashMap<String, HashMap<String, Double>> getEdges(String path) {
        HashMap<String, HashMap<String, Double>> edgesMap = new HashMap<String, HashMap<String, Double>>();
        // try-with-resources: the reader is closed even when parsing fails
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(path)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int mark = line.indexOf('\t');
                String preNode = line.substring(0, mark);
                line = line.substring(mark + 1);// strip the predecessor for easier parsing

                // parse each "successor$weight" token
                HashMap<String, Double> tempMap = new HashMap<String, Double>();
                StringTokenizer st = new StringTokenizer(line, "\t");
                while (st.hasMoreTokens()) {
                    String s = st.nextToken();
                    int tag = s.indexOf('$');
                    String sucNode = s.substring(0, tag);
                    double weight = Double.valueOf(s.substring(tag + 1));
                    tempMap.put(sucNode, weight);
                }// end while

                if (tempMap.size() != 0) {
                    edgesMap.put(preNode, tempMap);
                }
            }// end for
        } catch (Exception e) {
            // deliberate best effort: an unreadable file yields a (possibly partial) map
        }
        return edgesMap;
    }

    // Record one occurrence of the edge preNode -> sucNode in edgesMap,
    // incrementing its count (starting from 1.0 for a new edge).
    public void recordEdges(String preNode, String sucNode,
                            HashMap<String, HashMap<String, Double>> edgesMap) {
        // single lookup; the map reference is shared, so no re-put of an
        // existing inner map is needed (the original re-put it every call)
        HashMap<String, Double> tempMap = edgesMap.get(preNode);
        if (tempMap == null) {
            tempMap = new HashMap<String, Double>();
            edgesMap.put(preNode, tempMap);
        }
        Double edgeCount = tempMap.get(sucNode);
        tempMap.put(sucNode, edgeCount == null ? 1.0 : edgeCount + 1);
    }

    // Read the active-punctuation list (one punctuation mark per line).
    // Returns an empty set when the file is missing or unreadable.
    public static HashSet<String> getEffectivePunctuationList() {
        String effectivePunctuationPath = "files//effectivePunctuationList.txt";
        HashSet<String> punctuationList = new HashSet<String>();

        // try-with-resources: the reader is closed even on a read failure
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(
                effectivePunctuationPath)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                punctuationList.add(line);
            }
        } catch (Exception e) {
            // deliberate best effort: a missing file means "no active punctuation"
        }
        return punctuationList;
    }

    // Read the full punctuation list (one punctuation mark per line).
    // Returns an empty set when the file is missing or unreadable.
    public static HashSet<String> getAllPunctuationList() {
        String allPunctuationPath = "files//allPunctuationList.txt";
        HashSet<String> punctuationList = new HashSet<String>();

        // try-with-resources: the reader is closed even on a read failure
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(
                allPunctuationPath)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                punctuationList.add(line);
            }
        } catch (Exception e) {
            // deliberate best effort: a missing file means "no punctuation list"
        }
        return punctuationList;
    }

    // Write each element of the set to `path`, one per line, with no trailing
    // newline after the last element (file is truncated first).
    public void outPutHashSet(String path , HashSet<String> hashSet) {
        // try-with-resources: the writer is closed (and flushed) even on failure
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                path), false))) {
            Iterator<String> iterator = hashSet.iterator();
            while (iterator.hasNext()) {
                writer.write(iterator.next());
                // newline only BETWEEN entries, not after the last one
                if (iterator.hasNext()) {
                    writer.newLine();
                }
            }
            // single flush instead of one per element
            writer.flush();
        } catch (Exception e) {
            // deliberate best effort: output failures are ignored
        }
    }

    // Write a HashMap to `path` as "key<TAB>value" lines, with no trailing
    // newline after the last entry (file is truncated first).
    public void outPutHashMap(String path, HashMap<String,Integer> hashMap) {
        // try-with-resources: the writer is closed (and flushed) even on failure
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                path), false))) {
            Iterator<Map.Entry<String,Integer>> iterator = hashMap.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String,Integer> entry = iterator.next();
                writer.write(entry.getKey()+'\t'+entry.getValue());
                // newline only BETWEEN entries, not after the last one
                if (iterator.hasNext()) {
                    writer.newLine();
                }
            }
            // single flush instead of one per entry
            writer.flush();
        } catch (Exception e) {
            // deliberate best effort: output failures are ignored
        }
    }


    // Write an edge set to `path`. Each line is:
    //   preNode<TAB>suc1$weight1<TAB>suc2$weight2...
    // with no trailing newline after the last line (file is truncated first).
    public void outPutEdges(String path,HashMap<String, HashMap<String, Double>> edgesMap) {
        // try-with-resources: the writer is closed (and flushed) even on failure
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(path), false))) {
            Iterator<Map.Entry<String, HashMap<String, Double>>> nodesIterator = edgesMap
                    .entrySet().iterator();
            while (nodesIterator.hasNext()) {
                Map.Entry<String, HashMap<String, Double>> entry = nodesIterator.next();

                // StringBuilder avoids quadratic string concatenation per line
                StringBuilder outPut = new StringBuilder(entry.getKey());

                // append each successor and its weight
                for (Map.Entry<String, Double> edge : entry.getValue().entrySet()) {
                    outPut.append('\t').append(edge.getKey()).append('$')
                            .append(edge.getValue());
                }
                writer.write(outPut.toString());
                // newline only BETWEEN lines, not after the last one
                if (nodesIterator.hasNext()) {
                    writer.newLine();
                }
            }
            // single flush instead of one per line
            writer.flush();
        } catch (Exception e) {
            // deliberate best effort: output failures are ignored
        }
    }

    // Compute edge weights in place: for each edge (i,j) with raw count
    // p(i,j), the weight becomes p(i,j) / (p(i) + p(j) - p(i,j)) rounded to
    // three decimals (formula provisional, per the original author's note).
    // Edges whose endpoint is missing from nodesMap keep their raw count.
    // Removed: a leftover no-op debug branch for the words "○"/"年".
    public void calculateEdgesWeight(HashMap<String, Integer> nodesMap,
                                     HashMap<String, HashMap<String, Double>> edgesMap) {
        String curPre = "", curSuc = "";// context for the failure report below
        try {
            for (Map.Entry<String, HashMap<String, Double>> entry : edgesMap.entrySet()) {
                String preNode = entry.getKey();
                Integer preCount = nodesMap.get(preNode);
                if (preCount == null) {
                    continue;// predecessor was filtered out of the node set
                }
                curPre = preNode;

                HashMap<String, Double> edges = entry.getValue();
                for (Map.Entry<String, Double> edge : edges.entrySet()) {
                    String sucNode = edge.getKey();
                    Integer sucCount = nodesMap.get(sucNode);
                    if (sucCount == null) {
                        continue;// successor was filtered out of the node set
                    }
                    curSuc = sucNode;

                    double edgeCount = edge.getValue();

                    // p(i,j)/(p(i)+p(j)-p(i,j)), rounded via doubleFormat.
                    // NOTE(review): DecimalFormat is locale-sensitive; in a
                    // locale with ',' as decimal separator Double.valueOf
                    // would throw here — confirm the default locale in use.
                    double edgeWeight = Double.valueOf(doubleFormat
                            .format(edgeCount
                                    / (preCount + sucCount - edgeCount)));
                    // put() of an existing key during entrySet iteration is
                    // not a structural modification, so this is safe
                    edges.put(sucNode, edgeWeight);
                }// end for
                // no re-put of `edges` needed: it is the live map reference
            }// end for
        } catch (Exception e) {
            System.out.println("异常: " + curPre + " " + curSuc);
        }
    }

    // Common node set of two texts: the intersection of their node sets,
    // where each common node keeps the SMALLER of the two frequencies
    // (provisional choice, per the original author's note).
    public HashMap<String, Integer> getCommonNodes(String fileName1,String fileName2) {
        HashMap<String, Integer> nodesMap1 = getNodes(PathClass.nodesFolder+fileName1);
        HashMap<String, Integer> nodesMap2 = getNodes(PathClass.nodesFolder+fileName2);
        HashMap<String, Integer> nodesMapMCG = new HashMap<String, Integer>();

        // walk file1's nodes and keep those also present in file2
        for (Map.Entry<String, Integer> entry : nodesMap1.entrySet()) {
            Integer otherCount = nodesMap2.get(entry.getKey());
            if (otherCount != null) {
                nodesMapMCG.put(entry.getKey(), Math.min(entry.getValue(), otherCount));
            }
        }

        return nodesMapMCG;
    }

    // Common edge set. For every common node (from nodesMapMCG) that has
    // outgoing edges in BOTH texts, intersect the two successor maps; each
    // common edge keeps the SMALLER of the two weights (provisional choice,
    // per the original author's note).
    public HashMap<String, HashMap<String, Double>> getCommonEdges(String fileName1,String fileName2,HashMap<String, Integer> nodesMapMCG){
        HashMap<String, HashMap<String, Double>> edgesMap1 = getEdges(PathClass.edgesFolder+fileName1);
        HashMap<String, HashMap<String, Double>> edgesMap2 = getEdges(PathClass.edgesFolder+fileName2);
        HashMap<String, HashMap<String, Double>> edgesMapMCG = new HashMap<String, HashMap<String, Double>>();

        // iterate over the common nodes as candidate predecessors
        for (String preNode : nodesMapMCG.keySet()) {
            HashMap<String, Double> successors1 = edgesMap1.get(preNode);
            HashMap<String, Double> successors2 = edgesMap2.get(preNode);

            // both texts must have a non-empty successor set for this node
            if (successors1 == null || successors2 == null
                    || successors1.size() == 0 || successors2.size() == 0) {
                continue;
            }

            // intersect the two successor maps
            HashMap<String, Double> MCGEdges = new HashMap<String, Double>();
            for (Map.Entry<String, Double> edgeEntry : successors1.entrySet()) {
                Double otherWeight = successors2.get(edgeEntry.getKey());
                if (otherWeight != null) {
                    // shared successor: keep the smaller weight
                    MCGEdges.put(edgeEntry.getKey(), Math.min(edgeEntry.getValue(), otherWeight));
                }
            }

            if (MCGEdges.size() != 0) {
                edgesMapMCG.put(preNode, MCGEdges);
            }
        }

        return edgesMapMCG;
    }

    // Text similarity, algorithm 1 (provisional formulas, per the original
    // author's notes): the node part scales the common-node count by the
    // larger node-set size; the edge part sums, over all common edges,
    // weightMCG / max(weight1, weight2) and scales by the larger edge count.
    // Result is printed, not returned.
    public void calculateSimilarity1(int file1,int file2,HashMap<String, Integer> nodesMapMCG,HashMap<String, HashMap<String, Double>> edgesMapMCG){
        double alpha = 0.5;
        int nodesNumber1 = getNodes(PathClass.nodesFolder+file1+".txt").size();
        int nodesNumber2 = getNodes(PathClass.nodesFolder+file2+".txt").size();

        int edgesCount1 = statisticsEdges(getEdges(PathClass.edgesFolder+file1+".txt"));
        int edgesCount2 = statisticsEdges(getEdges(PathClass.edgesFolder+file2+".txt"));

        // node similarity
        double nodesSimilarity = alpha*(double)nodesMapMCG.size()/Math.max(nodesNumber1, nodesNumber2);

        // edge similarity: accumulate weight ratios over every common edge
        double weightSum = 0;
        for (Map.Entry<String, HashMap<String, Double>> nodeEntry : edgesMapMCG.entrySet()) {
            String preNode = nodeEntry.getKey();
            for (Map.Entry<String, Double> edgeEntry : nodeEntry.getValue().entrySet()) {
                String sucNode = edgeEntry.getKey();
                double weightMCG = edgeEntry.getValue();
                // look the original per-text weights up from the edge files
                double weight1 = checkWeight(PathClass.edgesFolder+file1+".txt",preNode,sucNode);
                double weight2 = checkWeight(PathClass.edgesFolder+file2+".txt",preNode,sucNode);
                weightSum += weightMCG/Math.max(weight1, weight2);
            }
        }
        double edgesSimilarity = (1-alpha)*weightSum/Math.max(edgesCount1,edgesCount2);

        double similarity = nodesSimilarity + edgesSimilarity;

        System.out.println("算法1-文本"+file1+"与文本"+file2+"相似度："+similarity+" "+nodesSimilarity+" "+edgesSimilarity);
    }

    // Text similarity, algorithm 2: node part uses summed node frequencies,
    // edge part uses summed edge weights, each normalized by the larger of
    // the two texts' totals. Result is printed, not returned.
    public void calculateSimilarity2(int file1,int file2,HashMap<String, Integer> nodesMapMCG,HashMap<String, HashMap<String, Double>> edgesMapMCG){
        double alpha = 0.5;

        // summed node frequencies for each text and for the common subgraph
        int nodesCount1 = calculateNodes(getNodes(PathClass.nodesFolder+file1+".txt"));
        int nodesCount2 = calculateNodes(getNodes(PathClass.nodesFolder+file2+".txt"));
        int nodesCountMCG = calculateNodes(nodesMapMCG);

        // summed edge weights for each text and for the common subgraph
        double edgesCount1 = calculateEdges(getEdges(PathClass.edgesFolder+file1+".txt"));
        double edgesCount2 = calculateEdges(getEdges(PathClass.edgesFolder+file2+".txt"));
        double edgesCountMCG = calculateEdges(edgesMapMCG);

        // algorithm 2
        double nodesSimilarity = alpha*(double)nodesCountMCG/Math.max(nodesCount1, nodesCount2);
        double edgesSimilarity = (1-alpha)*edgesCountMCG/Math.max(edgesCount1,edgesCount2);
        double similarity = nodesSimilarity + edgesSimilarity;
        System.out.println("算法2-文本"+file1+"与文本"+file2+"相似度："+similarity+" "+nodesSimilarity+" "+edgesSimilarity);

    }

    // Text similarity, algorithm 3: the average of the common subgraph's
    // share in each text, split evenly between nodes and edges
    //   alpha*0.5*(mcg/c1 + mcg/c2)  for nodes and (1-alpha)*... for edges.
    // Fix: when a text has zero nodes or zero summed edge weight the original
    // divided by zero and returned NaN/Infinity; that component now
    // contributes 0.0 instead.
    public double calculateSimilarity3(String fileName1, String fileName2,HashMap<String, Integer> nodesMapMCG,HashMap<String, HashMap<String, Double>> edgesMapMCG) {
        double alpha = 0.5;
        int nodesCount1 = calculateNodes(getNodes(PathClass.nodesFolder + fileName1));
        int nodesCount2 = calculateNodes(getNodes(PathClass.nodesFolder + fileName2));
        int nodesCountMCG = calculateNodes(nodesMapMCG);

        double edgesCount1 = calculateEdges(getEdges(PathClass.edgesFolder + fileName1));
        double edgesCount2 = calculateEdges(getEdges(PathClass.edgesFolder + fileName2));
        double edgesCountMCG = calculateEdges(edgesMapMCG);

        // node component; guard against an empty node set (division by zero)
        double nodesSimilarity = 0.0;
        if (nodesCount1 > 0 && nodesCount2 > 0) {
            nodesSimilarity = alpha * 0.5 * ((double) nodesCountMCG / nodesCount1
                    + (double) nodesCountMCG / nodesCount2);
        }

        // edge component; guard against a zero edge-weight total
        double edgesSimilarity = 0.0;
        if (edgesCount1 > 0 && edgesCount2 > 0) {
            edgesSimilarity = (1 - alpha) * 0.5 * (edgesCountMCG / edgesCount1
                    + edgesCountMCG / edgesCount2);
        }

        return nodesSimilarity + edgesSimilarity;
    }



    // Test helper.
    // Count the total number of edges: the sum of every successor map's size.
    public int statisticsEdges(HashMap<String, HashMap<String, Double>> edgesMapMCG){
        int edgesCount = 0;
        for (Map.Entry<String, HashMap<String, Double>> nodeEntry : edgesMapMCG.entrySet()) {
            edgesCount += nodeEntry.getValue().size();
        }
        return edgesCount;
    }

    // Test helper.
    // Look up the weight of the edge preNode -> sucNode in an edge file whose
    // lines are "preNode<TAB>suc1$w1<TAB>suc2$w2...". Returns 0.0 when the
    // edge (or the preNode line) is absent and -1.0 on any I/O/parse error.
    // Fixes vs. the original: the reader leaked on every early return (now
    // try-with-resources), and `line.indexOf(sucNode, ...)` could match a
    // SUBSTRING of a different successor — tokens are now compared exactly.
    public double checkWeight(String path,String preNode,String sucNode){
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(path)))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                int mark = line.indexOf('\t');
                if (mark < 0 || !preNode.equals(line.substring(0, mark))) {
                    continue;// not this predecessor's line
                }

                // scan the "successor$weight" tokens for an exact name match
                StringTokenizer st = new StringTokenizer(line.substring(mark + 1), "\t");
                while (st.hasMoreTokens()) {
                    String s = st.nextToken();
                    int tag = s.indexOf('$');
                    if (tag != -1 && sucNode.equals(s.substring(0, tag))) {
                        return Double.valueOf(s.substring(tag + 1));
                    }
                }
                // preNode line found but no such successor
                return 0.0;
            }// end for
            // no line for preNode at all
            return 0.0;
        }
        catch(Exception e){
            return -1.0;
        }
    }

    // Test helper.
    // Sum of all node frequencies in the map.
    public int calculateNodes(HashMap<String, Integer> nodesMapMCG){
        int WFCount = 0;
        for (Integer frequency : nodesMapMCG.values()) {
            WFCount += frequency;
        }
        return WFCount;
    }

    // Test helper.
    // Sum of all edge weights across every successor map.
    public double calculateEdges(HashMap<String, HashMap<String, Double>> edgesMapMCG){
        double edgesWeightCount = 0.0;
        for (HashMap<String, Double> successors : edgesMapMCG.values()) {
            for (Double weight : successors.values()) {
                edgesWeightCount += weight;
            }
        }
        return edgesWeightCount;
    }

    // Minimum spanning tree via Prim's algorithm over the similarity files:
    // each round scans the similarity lists of all processed texts, picks the
    // highest-similarity edge to an unprocessed text, adds it to the MST and
    // appends it to MSTFolder/edges.txt.
    // Fixes vs. the original: the candidate check used `sucTextID != ""`
    // (reference comparison, effectively always true after the first round);
    // the candidate was not reset per round, so a round with no candidate
    // re-used a stale one and could loop forever — we now track a `found`
    // flag, reset per round, and break when no new text is reachable.
    // Readers/writer are closed via try-with-resources.
    public TreeSet<MSTEdge> createMST(String similarityFolder) {
        TreeSet<MSTEdge> MST = new TreeSet<MSTEdge>();
        HashSet<String> processedSet = new HashSet<String>();// already-processed text ids

        // start from text 0
        processedSet.add("0");

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                PathClass.MSTFolder + "edges.txt"), false))) {

            // until every text has been processed
            while (processedSet.size() < filesSize) {

                // per-round candidate state (reset every round)
                String preTextID = "";
                String sucTextID = "";
                double maxSimilarity = 0.0;
                boolean found = false;

                // scan every processed text's similarity list
                for (String curTextID : processedSet) {
                    try (BufferedReader reader = new BufferedReader(new FileReader(
                            new File(similarityFolder + curTextID + ".txt")))) {

                        // each line: neighborId<TAB>similarity
                        for (String line = reader.readLine(); line != null; line = reader
                                .readLine()) {
                            int mark = line.indexOf('\t');
                            String neighborID = line.substring(0, mark);

                            // skip texts already in the tree
                            if (processedSet.contains(neighborID)) {
                                continue;
                            }

                            String value = line.substring(mark + 1);
                            // "?" marks an unavailable similarity — treat as 0
                            if (value.equals("?")) {
                                value = "0.0";
                            }

                            double similarity = Double.valueOf(value);
                            if (similarity >= maxSimilarity) {
                                preTextID = curTextID;
                                sucTextID = neighborID;
                                maxSimilarity = similarity;
                                found = true;
                            }
                        }// end for
                    }
                }// end for

                // no unprocessed text is reachable: stop instead of spinning
                if (!found) {
                    break;
                }

                processedSet.add(sucTextID);

                // record the edge preTextID -> sucTextID with weight maxSimilarity
                MST.add(new MSTEdge(Integer.valueOf(preTextID), Integer
                        .valueOf(sucTextID), maxSimilarity));

                writer.write(preTextID + "\t" + sucTextID + "\t" + maxSimilarity);
                writer.newLine();
                writer.flush();

                System.out.println("添加次数："+processedSet.size()+" "+preTextID+" "+sucTextID);
            }// end while
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return MST;
    }

    // Clustering by target cluster count: removing the (number-1) weakest
    // edges from the MST splits it into `number` clusters. The list is
    // mutated in place and also returned; remaining edges are written to
    // MSTFolder/clusteringByNumber.txt.
    // Fixes vs. the original: a discarded `MST.toArray()` call was removed,
    // the loop now guards against removing from an emptied list, and the
    // writer is closed via try-with-resources.
    public ArrayList<MSTEdge> clusteringByNumber(ArrayList<MSTEdge> MST, int number) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                PathClass.MSTFolder + "clusteringByNumber.txt"), false))) {

            // drop the first (number-1) edges; assumes the list is ordered
            // weakest-first — TODO confirm against the MST producer
            for (; number > 1 && !MST.isEmpty(); number--) {
                System.out.println("指定数目"+MST.get(0).getPreNode()+" "+MST.get(0).getSucNode()+" "+MST.get(0).getWeight()+" "+MST.size());
                MST.remove(0);
            }

            // write the surviving edges
            for (MSTEdge edge : MST) {
                writer.write(edge.getPreNode() + "\t" + edge.getSucNode()
                        + "\t" + edge.getWeight());
                writer.newLine();
            }
            writer.flush();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }

        return MST;
    }

    // Clustering by similarity threshold: removes leading edges whose weight
    // is below `value`, splitting the MST into clusters. The list is mutated
    // in place and also returned; remaining edges are written to
    // MSTFolder/clusteringByValue.txt.
    // Fixes vs. the original: `MST.get(0)` threw IndexOutOfBoundsException
    // when every edge was below the threshold (now guarded by isEmpty), and
    // the writer is closed via try-with-resources.
    public ArrayList<MSTEdge> clusteringByValue(ArrayList<MSTEdge> MST, double value) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
                PathClass.MSTFolder + "clusteringByValue.txt"), false))) {

            // drop leading edges below the threshold; assumes the list is
            // ordered weakest-first — TODO confirm against the MST producer
            while (!MST.isEmpty() && MST.get(0).getWeight() < value) {
                System.out.println("指定数目"+MST.get(0).getPreNode()+" "+MST.get(0).getSucNode()+" "+MST.get(0).getWeight()+" "+MST.size());
                MST.remove(0);
            }// end while

            // write the surviving edges
            for (MSTEdge edge : MST) {
                writer.write(edge.getPreNode() + "\t" + edge.getSucNode()
                        + "\t" + edge.getWeight());
                writer.newLine();
            }
            writer.flush();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return MST;
    }

    // 显示聚类结果
    /**
     * Materializes the clustering implied by the given edge list and writes it
     * to {@code path}: every edge is folded into a ClusterSet, documents that
     * appear in no edge are added as isolated nodes, and the result is dumped
     * to the output file.
     *
     * @param path output file path for the cluster listing
     * @param MST  edge list describing which documents are connected
     */
    public void showCluster(String path, ArrayList<MSTEdge> MST) {
        ClusterSet clusters = new ClusterSet();
        for (MSTEdge edge : MST) {
            clusters.addCluster(edge);
        }

        // Documents never mentioned by any edge still need a (singleton) entry.
        clusters.checkIsolatedNodes(filesSize);

        clusters.outPutClusterSet(path);
    }


    // 计算文本相似度
    public void getSimilarity() {
        // 求文本i与文本j的最大公共子图
        // 计算相似度输出到文档
        String fileName1="";
        String fileName2="";

        try {
            BufferedWriter writer;
            for (int file1 = 0; file1 < filesSize; file1++) {
                fileName1=filesList.get(file1);
                writer = new BufferedWriter(new FileWriter(new File(
                        PathClass.similarityFolder + fileName1), false));

                //此处优化时可以file2=file1,即下三角矩阵
                for (int file2 = 0; file2 < filesSize; file2++) {
                    fileName2=filesList.get(file2);

                    // 此处可以先判断getMCG(file1, file2)相似度阈值

                    if (!fileName1.equals(fileName2)) {
                        writer.write(fileName2 + "\t"
                                + doubleFormat.format(getMCG(fileName1, fileName2)));
                        writer.newLine();
                        writer.flush();
                    }

                }
                writer.close();
            }
            System.out.println("计算文本相似度");
        } catch (Exception e) {
        }
    }


    // 建立文本映射模型
    public void buildTextModel() {
        String fileName="";

        // 处理fileCount号文本
        for (fileCount = 0; fileCount < filesSize; fileCount++) {
            // 清空边集
            // edgesMap.clear();

            fileName=filesList.get(fileCount);

            //System.out.println(fileCount + "号文件开始处理");

            // 读取结点数据
            nodesMap = getNodes(PathClass.statisticsFolder + fileName);
            edgesMap = BuildEdges(fileName);

            // 记录结点集并输出到文档
            recordNodes(fileName, nodesMap);

            // 输出边集到文档(带频数)
            // outPutEdges(fileCount,edgesMap);

            // 计算边权值
            calculateEdgesWeight(nodesMap, edgesMap);

            // 输出边集到文档(带权值)
            outPutEdges(PathClass.edgesFolder + fileName, edgesMap);

            //System.out.println(fileName + " 文件处理完成");

        }// end for
    }

    //从文本读取MST
    /**
     * Reads the minimum-spanning-tree edge list back from
     * {@code MSTFolder/edges.txt}. Each line has the form
     * "preNode \t sucNode \t weight"; the resulting list is sorted
     * (ascending, per MSTEdge's natural ordering) before being returned.
     *
     * @return the sorted edge list; empty if the file is missing or unreadable
     */
    public ArrayList<MSTEdge> readMST() {
        ArrayList<MSTEdge> MST = new ArrayList<MSTEdge>();

        // try-with-resources: the original never closed the reader, leaking
        // the file handle even on the success path.
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(
                PathClass.MSTFolder + "edges.txt")))) {

            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                // Split on the first and last tab so weight parsing is robust.
                int mark1 = line.indexOf('\t');
                int mark2 = line.lastIndexOf('\t');
                MST.add(new MSTEdge(
                        Integer.valueOf(line.substring(0, mark1)),
                        Integer.valueOf(line.substring(mark1 + 1, mark2)),
                        Double.valueOf(line.substring(mark2 + 1))));
            }
            Collections.sort(MST);
        } catch (Exception e) {
            // The original catch block was empty, silently returning a partial
            // (possibly empty) list; at minimum report the failure.
            System.out.println(e.getMessage());
        }
        return MST;
    }


    // convert TreeSet To ArrayList
    /**
     * Drains the given TreeSet into an ArrayList in ascending order.
     *
     * The original implementation repeatedly called {@code first()} /
     * {@code remove(first())}, an O(n log n) drain; the copy constructor
     * iterates the set once (already in sorted order) in O(n). The set is
     * then cleared to preserve the original's draining side effect.
     *
     * @param ts the source set; emptied by this call
     * @return a new list containing the set's former elements in order
     */
    public ArrayList<MSTEdge> convertTreeSetToArrayList(TreeSet<MSTEdge> ts) {
        ArrayList<MSTEdge> al = new ArrayList<MSTEdge>(ts);
        ts.clear();
        return al;
    }
}
